import numpy as np
import pandas as pd
from functools import partial
from IPython.display import HTML, display
from colorcet import fire
import datashader as ds
import dask.dataframe as dd
from bokeh.io import output_notebook
from bokeh.plotting import figure, output_notebook, show
from datashader.bokeh_ext import InteractiveImage
from datashader import transfer_functions as tf
from datashader.utils import export_image
from datashader.colors import colormap_select, Greys9, Hot, viridis, inferno
from bokeh.models.tiles import WMTSTileSource
import geoviews as gv
import holoviews as hv
from holoviews.operation.datashader import aggregate, datashade
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
df = pd.read_csv('Utah/Utah crashes_2010-2017.csv', parse_dates=[['CRASHDATE', 'CRASHTIME']])
df.head()
df.shape
df.iloc[1].RoadView
df_extr = df.RoadView.str.extract('.*\?db=(.*)&project=(.*)&run=(.*)&lat=(.*)&lon=(.*)')
df_extr.columns = ["db", "project", "run", "lat", "lon"]
df = pd.concat([df, df_extr], axis=1)
df.rename(columns={'CRASHDATE_CRASHTIME': 'DATE'}, inplace=True)
df['dayofyear'] = df.DATE.dt.dayofyear
df['year'] = df.DATE.dt.year
df['month'] = df.DATE.dt.month
df['day'] = df.DATE.dt.day
df['hour'] = df.DATE.dt.hour
df['dayofweek'] = df.DATE.dt.dayofweek
df['incident'] = 1
df.set_index('DATE', inplace=True)
df['lon'] = df.lon.astype(np.float)
df['lat'] = df.lat.astype(np.float)
df.Latitude.min(), df.Latitude.max()
df = df[df.Latitude>100]
df.shape
df.to_pickle('df_utah.pkl')
import calmap
calmap.calendarplot(df[df.CrashSeverity == 5]['incident'], linewidth=0, cmap='OrRd', fig_kws=dict(figsize=(18, 14)));
calmap.calendarplot(df['incident'], linewidth=0, cmap='OrRd', fig_kws=dict(figsize=(18, 14)));
We see a high number of crashes in Dec-Jan and low in Feb-Mar. Crashes on weekends are lower, escpecially on Sundays. Fridays seem to have the most number of crashes. However, the most severe crashes are not in winters, but instead in the 100 days of summer.
day_week_labels = ['Mon', 'Tues', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
day_week_hr_labels = [day_week_labels[day] + '-' + str(hour)
for day in range(0,7)
for hour in range(0,25)]
df['day_week_hour'] = df.dayofweek*25 + df.hour
import matplotlib.pyplot as plt
fig = plt.figure()
fig.set_size_inches(50,6)
fig.add_subplot(111)
ax_day_week_hr = df['day_week_hour'].value_counts().sort_index().plot(kind='bar', color='k')
ax_day_week_hr.set_xticklabels(day_week_hr_labels);
ax = df['dayofweek'].value_counts().sort_index().plot(kind='bar', color='k')
ax.set_xticklabels(day_week_labels);
df['month'].value_counts().sort_index().plot(kind='bar', color='k')
df['hour'].value_counts().sort_index().plot(kind='bar', color='k')
df['incident'].resample('Y').sum().plot()
Accidents are increasing year or year with a dip in 2014
fig = plt.figure(figsize=(15,5))
ax = fig.add_subplot(1, 1, 1)
ax.plot(df['incident'].resample('M').sum(), 'k', label='Total crashes')
ax.plot(df[df.CrashSeverity>3]['incident'].resample('M').sum(), 'r', label='Severe crashes')
fig = plt.figure(figsize=(15,3))
ax1 = fig.add_subplot(1, 1, 1)
ax1.plot(df['incident'].resample('M').sum(), 'b-')
ax1.set_xlabel('years')
# Make the y-axis label, ticks and tick labels match the line color.
ax1.set_ylabel('All Crashes', color='b')
ax1.tick_params('y', colors='b')
ax2 = ax1.twinx()
ax2.plot(df[df.CrashSeverity==5]['incident'].resample('M').sum(), 'r-')
ax2.set_ylabel('Severe Crashes', color='r')
ax2.tick_params('y', colors='r')
fig.tight_layout()
plt.show()
The number of crashes is increasing every year, and there is a strong seasonality, with most crashes in Nov-Dec and least in Feb-Apr. Feb 2011 has unusually low crashes or missing data (more likely). However, the most severe crashes occur in the summer months.
df.groupby('ROADWAYJUNCTFEATUREID')['ROADWAYJUNCTFEATUREID'].count().sort_values(ascending=False)
agg = ds.Canvas(750,750).points(df, 'lon','lat')
tf.set_background(tf.shade(agg, cmap=fire),"black")
agg = ds.Canvas(750,750).points(df[df['CrashSeverity']>3], 'lon','lat')
tf.set_background(tf.shade(agg, cmap=fire),"black")
factors = df.groupby('FIRSTHARMFULEVENTID')['FIRSTHARMFULEVENTID'].count().sort_values(ascending=False)
factors
factors[:30].plot(kind='barh', figsize=(14, 10));
for factor in factors.index[:10]:
print
factor_df = df[df.FIRSTHARMFULEVENTID == factor]
print('{} : {} incidents,'.format(factor, len(factor_df)))
agg = ds.Canvas(750,750).points(factor_df, 'lon','lat')
display(tf.set_background(tf.shade(agg, cmap=fire),"black"))
df.loc[:, 'x'], df.loc[:, 'y'] = ds.utils.lnglat_to_meters(df.lon,df.lat)
hv.extension('bokeh')
url = 'https://server.arcgisonline.com/arcgis/rest/services/Canvas/World_Dark_Gray_Base/MapServer/tile/{z}/{y}/{x}'
tile_opts = dict(width=1000,height=600,xaxis=None,yaxis=None,bgcolor='black',show_grid=False)
map_tiles = gv.WMTS(url).opts(style=dict(alpha=0.5), plot=tile_opts)
points = hv.Points(df, ['x', 'y'])
map_data = datashade(points, x_sampling=1, y_sampling=1, cmap=fire, width=1000, height=600)
map_tiles * map_data
df['hourcat'] = df['hour'].astype("category")
y_range = df.y.min(),df.y.max()
x_range = df.x.min(),df.x.max()
extent = x_range, y_range
plot_width = int(750)
plot_height = int(plot_width//1.2)
background = "black"
export = partial(export_image, export_path="export", background=background)
cm = partial(colormap_select, reverse=(background=="black"))
def base_plot(tools='pan,wheel_zoom,reset',plot_width=plot_width, plot_height=plot_height, **plot_args):
p = figure(tools=tools, plot_width=plot_width, plot_height=plot_height,
x_range=x_range, y_range=y_range, outline_line_color=None,
min_border=0, min_border_left=0, min_border_right=0,
min_border_top=0, min_border_bottom=0, **plot_args)
p.axis.visible = False
p.xgrid.grid_line_color = None
p.ygrid.grid_line_color = None
return p
options = dict(line_color=None, fill_color='blue', size=5)
from bokeh.models import WMTSTileSource
colors = ["#FF0000","#FF3F00","#FF7F00","#FFBF00","#FFFF00","#BFFF00","#7FFF00","#3FFF00",
"#00FF00","#00FF3F","#00FF7F","#00FFBF","#00FFFF","#00BFFF","#007FFF","#003FFF",
"#0000FF","#3F00FF","#7F00FF","#BF00FF","#FF00FF","#FF00BF","#FF007F","#FF003F",]
def colorized_images(x_range, y_range, w=plot_width, h=plot_height):
cvs = ds.Canvas(plot_width=w, plot_height=h, x_range=x_range, y_range=y_range)
agg = cvs.points(df, 'x', 'y', ds.count_cat('hourcat'))
img = tf.shade(agg, color_key=colors)
return tf.dynspread(img, threshold=0.3, max_px=4)
p = base_plot(background_fill_color=background)
p.add_tile(WMTSTileSource(url='https://server.arcgisonline.com/arcgis/rest/services/Canvas/World_Dark_Gray_Base/MapServer/tile/{z}/{y}/{x}'))
export(colorized_images(*extent),"NYC_collission_times")
InteractiveImage(p, colorized_images)
Here the order of colors is roughly red (midnight), yellow (4am), green (8am), cyan (noon), blue (4pm), purple (8pm), and back to red (since hours and colors are both cyclic).
df.ROADWAYJUNCTFEATUREID.unique()
basemap = WMTSTileSource(url='https://server.arcgisonline.com/arcgis/rest/services/Canvas/World_Light_Gray_Base/MapServer/tile/{z}/{y}/{x}')
def create_image_dfx(x_range, y_range, w=plot_width, h=plot_height):
cvs = ds.Canvas(plot_width=w, plot_height=h, x_range=x_range, y_range=y_range)
agg = cvs.points(dfx, 'x','y')
img = tf.shade(agg.where(agg>np.percentile(agg,90)), cmap=inferno, how='eq_hist')
return tf.dynspread(img, threshold=0.3, max_px=4)
dfx = df[df['ROADWAYJUNCTFEATUREID']=='Bike/Ped Path Intersection']
p = base_plot()
p.add_tile(basemap)
export(create_image_dfx( *extent),"BikePed Path Intersection")
InteractiveImage(p, create_image_dfx)
streets = df.groupby('RouteName')['RouteName']
count = streets.count()
count.sort_values(ascending=False, inplace=True)
print(count[:20])
Data sourced from https://data-uplan.opendata.arcgis.com/